What kind of people survived for example kids,women,men
As the problem statement itself carring the two set of Data training data,test data Here we will just have a look of what kind of data it is .
library(dplyr)
library(knitr)
library(DT)
train_df<-read.csv("Titanic_DataSet/train.csv",stringsAsFactors=F)
cat(paste('\ntest data structure having rows :\n', nrow(train_df)) )
test data structure having rows :
891
str(train_df)
'data.frame': 891 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : chr "male" "female" "female" "female" ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr "" "C85" "" "C123" ...
$ Embarked : chr "S" "C" "S" "S" ...
test_df<-read.csv("Titanic_DataSet/test.csv",stringsAsFactors=F)
cat(paste('\ntest data structure having rows :\n', nrow(test_df)) )
test data structure having rows :
418
str(test_df)
'data.frame': 418 obs. of 11 variables:
$ PassengerId: int 892 893 894 895 896 897 898 899 900 901 ...
$ Pclass : int 3 3 2 3 3 3 3 2 3 3 ...
$ Name : chr "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
$ Sex : chr "male" "female" "male" "male" ...
$ Age : num 34.5 47 62 27 22 14 30 26 18 21 ...
$ SibSp : int 0 1 0 0 1 0 0 1 0 2 ...
$ Parch : int 0 0 0 0 1 0 0 1 0 0 ...
$ Ticket : chr "330911" "363272" "240276" "315154" ...
$ Fare : num 7.83 7 9.69 8.66 12.29 ...
$ Cabin : chr "" "" "" "" ...
$ Embarked : chr "Q" "S" "Q" "S" ...
datatable(train_df,options=list(pageLength=5))
cat('\n test_df ----\n')
test_df ----
datatable(test_df,options=list(pageLength=5))
Insights:
As from the above result we can see that training Data structure is having one field extra that is “survived” as comapred to testing data which we have to predict in this data
PasseengerId:Unique Id Assigned to each traveller
Pclass : 1 for First Class 2 for Second Class 3 for third class
Name : Name is with first name and Last name Mostly starting from the Mr.Miss And Mrs. Few caseswith Master and don etc .
Age : Age of Passenger, few values are missing here
SibSp : Number of sibling or spouse 0 for none
Parch : Number of parents/childern travelling 0 for none
Ticket : Ticket Number
Fare : how much each traveller paid for the travel
Cabin : Cabin number allocated
Embarked: Port of emabarkation C,S,Q
Now we will start exploring the data
First we will do Univariate analysis,which means finding the continous and categorical variable Continous Variable are : Passenger Id, PClass, Age,Sibsp,Parch,Ticekt Number ,Fare Categorical : Sex, Embarked Character value Variable : Name ,Cabin,Ticekt
Get here the summary of relavant data like,pclass,age,sibsp,parch,fare to see the average
library(ggplot2)
library(plotly)
library(grid)
library(gridExtra)
cat('\nSummary Of Pclass\n')
Summary Of Pclass
par(mfrow=c(1,2))
#Summary for Pclass
ggplotly (ggplot(train_df, aes(x="",y = Pclass)) +
geom_boxplot(colour = "red", fill = "skyblue")+ggtitle("Pclass Summary"))
#Summary for Age
ggplotly (ggplot(train_df, aes(x="",y = Age)) +
geom_boxplot(colour = "red", fill = "skyblue")+ggtitle("Age Summary"))
Removed 177 rows containing non-finite values (stat_boxplot).
#Summary for SibSp
ggplotly (ggplot(train_df, aes(x="",y = SibSp)) +
geom_boxplot(colour = "red", fill = "skyblue")+ggtitle("SibSp Summary"))
#Summary for Parch
ggplotly (ggplot(train_df, aes(x="",y = Parch)) +
geom_boxplot(colour = "red", fill = "skyblue")+ggtitle("Parch Summary"))
#Summary for Fare
ggplotly (ggplot(train_df, aes(x="",y = Fare)) +
geom_boxplot(colour = "red", fill = "skyblue")+ggtitle("Fare Summary"))
Insight:
From the Results few things can be identified
Average age of traveller is around 29-30 and maximum age is 80
very few people were traveling with siblings and maximum number of siblings/spouse is 8
Maximum number of parents/childern is 6 and more outl
Average Fare is around 14.45 but outliers are more so prediction from fare will be tough
let see,from the given if we can find the Fare of first class, second class and third class Not sure it is required or not.
fare_single_passenger<-subset(train_df,(train_df$SibSp == 0) & (train_df$Parch ==0),select=c(Pclass,Fare))
fare_single_passenger
fare_single_passenger<-aggregate(fare_single_passenger[, 2], list(fare_single_passenger$Pclass), mean)
cat(paste('\n\nAverage Fare of First Class ',fare_single_passenger[1,2]))
Average Fare of First Class 63.6725137614679
cat(paste('\n\nAverage Fare of Second Class ',fare_single_passenger[2,2]))
Average Fare of Second Class 14.0661057692308
cat(paste('\n\nAverage Fare of Third Class ',fare_single_passenger[3,2]))
Average Fare of Third Class 9.27205185185185
Change the variable into numeric to fit on continous scale
train_df$Sex<-as.numeric(as.factor(train_df$Sex))
train_df$Embarked<-as.numeric(as.factor(train_df$Embarked))
test_df$Sex<-as.numeric(as.factor(test_df$Sex))
test_df$Embarked<-as.numeric(as.factor(test_df$Embarked))
further will visualize the data between survival and other variates .
here i am using the bar plot and manipulate so that user can choose the column value,number of row and see the respective bar plot
library(ggplot2)#library loadedr
library(ggthemes)
library(manipulate)
############# manipulate for the size and aes value
manipulate(
ggplot(data=train_df[sample(1:nrow(train_df),samplesize),], aes_string(x = x_axis, y = "Survived",fill=x_axis,size=samplesize)) +
geom_bar(stat="identity",width=0.5)+
scale_fill_gradient(low="blue", high="red")+
theme_solarized() ,
samplesize=slider(100,nrow(train_df)),
x_axis=picker("Pclass","Age","Sex","SibSp","Parch","Embarked")
)
###################
Insights:
1.From the above few points number female who survived is more as compared to male
Check the survival of group/family travelling together and singleton here i will use the ticket Number as if people are sharing the same ticket Number ,more chances that they are family or group travelling together .
library(DT)
lenofUniqueTicekt<-length(unique(train_df$Ticket))
cat(paste('Number of uniques Ticket',lenofUniqueTicekt))
Number of uniques Ticket 681
FamGp_df<-train_df[duplicated(train_df[,"Ticket"])|duplicated(train_df[,"Ticket"],fromLast = TRUE),] %>%
arrange(Ticket) %>%group_by(Ticket) %>% summarise(groupFmSize=n(),Survived=sum(Survived))
datatable(FamGp_df, options = list(pageLength = 5))
From the above table looks like possiblity of survival is little better if travelling with group/Family
before going into more visulaization just wanted to check which variable is playing a significant role in predicticing the survival applying logistics regression as the dependent variable is in binary form (0,1 ) means survived or not . Removing few columns like name passengerID ticket,cabin as name and passenger ID is unique and cabin is having lots of missing data .Ticket is again character value not worth .
training_model <- glm(Survived ~.-(Name+PassengerId +Ticket+Cabin),family=binomial(link='logit'),data=train_df,maxit=50)
summary(training_model)
Call:
glm(formula = Survived ~ . - (Name + PassengerId + Ticket + Cabin),
family = binomial(link = "logit"), data = train_df, maxit = 50)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7278 -0.6439 -0.3824 0.6243 2.4564
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.598284 0.849163 10.126 < 2e-16 ***
Pclass -1.222334 0.163604 -7.471 7.94e-14 ***
Sex -2.620535 0.219879 -11.918 < 2e-16 ***
Age -0.043306 0.008196 -5.284 1.27e-07 ***
SibSp -0.360270 0.127584 -2.824 0.00475 **
Parch -0.057472 0.123335 -0.466 0.64123
Fare 0.001469 0.002527 0.581 0.56107
Embarked -0.180037 0.131888 -1.365 0.17223
---
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 964.52 on 713 degrees of freedom
Residual deviance: 633.95 on 706 degrees of freedom
(177 observations deleted due to missingness)
AIC: 649.95
Number of Fisher Scoring iterations: 5
From the above result we can see the following
1.SibSp, Fare and Embarked are not statistically significant. 2.Sex has lowest p-Value indicating strong association between the gender and survival rate
Hence taking few Few more visualization related to Pclass,Sex,Age using plotly
library(ggplot2)
library(plotly)
#Draw thw chart for Male and female survivela
p<-ggplot(train_df, aes(Age, fill = factor(Survived))) +
geom_histogram() +
facet_grid(.~Sex)+theme_dark()
ggplotly(p)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Removed 177 rows containing non-finite values (stat_bin).
library(crosstalk)
shared_train_df <- SharedData$new(train_df)
bscols(d3scatter(shared_train_df,~SibSp,~Age,~Sex,width="100%",height = 300),d3scatter(shared_train_df,~Parch,~Age,~Sex,width = "100%",height = 300)
)
Insights
Clearly from the graph that female of age between 12-42 survived and very few male survived between the same age group so prediction make better add few more column like child ,FemaleAdult ,maleAdult category as they have Female adult high possiblity of survival.
Here on adding both the data(training and test ), as need to filled the missing values in both .
#combine both the data
library(dplyr)
titanic_df<-bind_rows(train_df,test_df)
#check the totral number of rows just to verify
cat(paste('\ncomplete data structure having rows :', nrow(titanic_df)) )
#checking the structure as well
str(titanic_df)
Add the Family size to each person
library(dplyr)
library(DT)
nrow((titanic_df))
#again recreate the Family/Group Size with whole data as we are going to add the family size in the test data to make prediction better
FamGp_df<-titanic_df[duplicated(titanic_df[,"Ticket"])|duplicated(titanic_df[,"Ticket"],fromLast = TRUE),] %>%
arrange(Ticket) %>%group_by(Ticket) %>% summarise(groupFmSize=n(),Survived=sum(Survived))
#keep only two column Ticket ,groupFmSize
FamGp_df <-FamGp_df[,c("Ticket","groupFmSize")]
#display the data
datatable(FamGp_df, options = list(pageLength = 5))
From the above we have got family/group size which we will merge the groupFmSize to get the size in the whole
library(dplyr)
titanic_df<-merge(x=titanic_df,y=FamGp_df,by="Ticket",all=TRUE)
#just to cross verify check the number of rows
cat(paste('\ncomplete data structure having rows :', nrow(titanic_df)) )
#reorder the column value
col_order<-c("PassengerId","Survived","Pclass","Name","Sex","Age","SibSp","Parch","Ticket","groupFmSize","Fare","Cabin","Embarked")
titanic_df<-titanic_df[,col_order] %>% arrange(PassengerId)
datatable(titanic_df, options = list(pageLength = 5))
Replacing all the NA values in groupFamSize with 0
titanic_df$groupFmSize[is.na(titanic_df$groupFmSize)]<-0
datatable(titanic_df, options = list(pageLength = 5))
Insights:
From the above we got complete data Now let us analyze which all value are missing
No missing value in data passenserId,Name,Sex,SibSp,Parch ,Ticket,Pclass Few Missing value in Age , Cabin and two missing value in Embarked and one missing value in fare
here i am ignoring few of attributes which is not much useful as we saw earlier like embarked,cabin(having more missing value) and fare as well
check the how many data is missing using the mice plot
library(VIM)
library(plotly)
mice_plot <- aggr(titanic_df[,c("Age")], col=c('blue','red'),
numbers=TRUE, sortVars=TRUE,
labels=c("Age")
, cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
Install Package install.packages(“Hmisc”)
filling the missing value in Age with the impute method taking the meadin of the data
library(Hmisc)
titanic_df$Age <- with(titanic_df, impute(Age, median))
#titanic_df[,c("Age")]
now all the missing value in age has been filled
As we discussed earlier, now Adding column AgeCategory will add if <18 child female Adult if age >18 and sex is female and male Adult if age>18 and sex is male
titanic_df$AgeCategory[titanic_df$Age <= 18] <-'child'
titanic_df$AgeCategory[titanic_df$Age > 18.00 & titanic_df$Sex =='female']<- 'FemaleAdult'
titanic_df$AgeCategory[titanic_df$Age > 18.00 & titanic_df$Sex =='male']<- 'maleAdult'
titanic_df$AgeCategory<-factor(titanic_df$AgeCategory)
#just to verify the data
datatable(titanic_df, options = list(pageLength = 5))
Now all the required data has been filled and resturtured so splitting again the data into training and test data
final_train_df<-titanic_df[1:nrow(train_df),]
final_test_df<-titanic_df[nrow(train_df)+1:nrow(test_df),]
datatable(final_train_df, options = list(pageLength = 5))
datatable(final_test_df, options = list(pageLength = 5))
predicting the model on training data here i am using decision tree to model as we have seen only few variable are significant so considering only with Pclass,Age,Sex,AgeCategory,FamgpSize
library("rpart")
library("rpart.plot")
rtree_fit <- rpart(Survived ~ Pclass+Age+factor(AgeCategory)+Sex+groupFmSize,
final_train_df,method="class")
summary(rtree_fit)
rpart.plot(rtree_fit,extra=104, box.palette="GnBu",
branch.lty=3, shadow.col="gray", nn=TRUE)
Insights
from the first level of tree shows out of total traveller only 38% survived while 62 % died and further only 19 % of male survievd however this ratio was more in case of women
library(MASS)
survivalPrediction <- predict(rtree_fit,final_test_df,type="class")
str(survivalPrediction)
tableP<-table(survivalPrediction)
pct <- round(tableP/sum(tableP) * 100)
label<-c("0","1")
lbls <- paste(label,'-',pct,'%') # add percents to labels
pie(tableP,col=c("red","yellow"),labels = lbls)
#save the data in CSV file
predicted_df <- data.frame(PassengerID = final_test_df$PassengerId, Survived = survivalPrediction)
head(predicted_df,200)
write.csv(predicted_df, file = 'Titanic_Prediction_Rpart.csv', row.names = F)
From the final data pie chart 65% people has been died while 35% has survived from disaster.
This is my first Exploration .Any improvments and suggestions always welcome: